This is analysis of mutational signatures in COAD and STAD datasets from ICGC for MMR paper (Meier, Volkova et al. 2017).
Preparation
source('plotting functions/plot_sigs.R')
source("nmSolve.R")
source("plotting functions/plot_decomposition.R")
source('plotting functions/scatterpie.R')
library(tsne)
library(NMF)
library(devtools)
#devtools::install_github("mg14/mg14") # for plotting purposes
library(mg14)
library(ggplot2)
library(reshape2)
library(VariantAnnotation)
library(deconstructSigs)
Get COSMIC signatures
sp_url <- paste("http://cancer.sanger.ac.uk/cancergenome/assets/",
"signatures_probabilities.txt", sep = "")
cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE)
cancer_signatures = cancer_signatures[order(cancer_signatures[,1]),]
types <- as.character(cancer_signatures$Trinucleotide) # trinucleotide classes
types.full <- as.character(cancer_signatures$Somatic.Mutation.Type) # substitution types
row.names(cancer_signatures) <- types.full
cancer_signatures = as.matrix(cancer_signatures[,4:33])
Adjust them from exomes:
cancer_signatures_adj <- cancer_signatures
for (i in 1:ncol(cancer_signatures)) {
cancer_signatures_adj[,i] <- cancer_signatures_adj[,i] / tri.counts.genome[types,1] * tri.counts.exome[types,1]
cancer_signatures_adj[,i] <- cancer_signatures_adj[,i] / sum(cancer_signatures_adj[,i])
}
Prepare human exome counts (regions taken from Agilent SureSelect V5 Human All Exon, human genome sequence - from hg19 build)
ref_genome="BSgenome.Hsapiens.UCSC.hg19"
library(BSgenome.Hsapiens.UCSC.hg19)
exome <- read.table("S04380110_Covered.bed",header = FALSE, sep="\t",stringsAsFactors=FALSE, skip=2,quote="")
a <- getSeq(get(ref_genome))
a <- a[1:23] # get rid of Y
gr <- as(seqinfo(a), "GRanges") # turn into GRanges object
genome(gr) <- "hg19"
# Get the sequence for well covered exome
exactexomelist <- list()
for (j in 1:23) {
tmp <- exome[exome$V1==seqlevels(a)[j],]
exactexomelist[[j]] <- lapply(1:nrow(tmp), function(i)
a[[j]][tmp$V2[i]:tmp$V3[i]])
}
Prepare C. elegans counts in order to account for trinucleotide content difference:
WBcel235 <- readDNAStringSet("C. elegans data/Caenorhabditis_elegans.WBcel235.dna_sm.toplevel.fa.gz")
worm.trinucleotides <- colSums(trinucleotideFrequency(WBcel235)[-5,])
human.trinucleotides <- as.vector(t(tri.counts.genome))
names(human.trinucleotides) <- row.names(tri.counts.genome)
trinucleotide.freq.factor <- sapply(unique(types), function(x) {
freq.worm <- worm.trinucleotides[x] + worm.trinucleotides[as.character(reverseComplement(DNAString(x)))]
return(freq.worm / human.trinucleotides[x]) # tri.counts.genome is already classified w.r.t. pyrimidine reference
})
human.trinucleotides <- as.vector(t(tri.counts.exome)) # / sum(tri.counts.genome))) # counts from "deconstructSigs" package
names(human.trinucleotides) <- row.names(tri.counts.exome)
trinucleotide.freq.factor.ex <- sapply(unique(types), function(x) {
freq.worm <- worm.trinucleotides[x] + worm.trinucleotides[as.character(reverseComplement(DNAString(x)))]
return(freq.worm / human.trinucleotides[x]) # tri.counts.genome is already classified w.r.t. pyrimidine reference
})
names(trinucleotide.freq.factor.ex) = names(trinucleotide.freq.factor) <- unique(types)
Visualize trinucleotide differences between C.elegans and human exome:
Upload the mutation counts from C. elegans samples and calculate C. elegans signatures using additive Poisson model:
\(Y_{i,j} = Pois(\lambda_{i,j})\),
\(E[Y_{i,j}] = N \cdot (\beta_{j,b} + X_{g_{1}} \beta_{j,g_{1}} + X_{g_{2}} \beta_{j,g_{2}} + X_{g_{1}:g_{2}} \beta_{j,g_{1}:g_{2}})\),
where \(β_{j,\cdot} \ge 0\) - effects, \(N\) - generation number, \(g_{1}\), \(g_{2}\) - genetic backgrounds, \(b\) - background contribution, \(X_{...} \in {0,1}\) indicates the presence of particular factors.
load("C. elegans data/Learned_signatures.RData")
# Contains mutation counts matrix mut_mat and exposure matrix small.X for the samples from C. elegans MMR and pole-4;pms-2 experiments.
learned.sigs <- nmSolve(t(mut_mat),small.X,maxIter=10000, tol = 1e-06, div.err = 1e-10)
for (i in 1:nrow(learned.sigs)) {
learned.sigs[i,] <- learned.sigs[i,] / sum(learned.sigs[i,])
}
plot_sig(t(learned.sigs))
Humanize the signatures:
learned.sigs.exome <- learned.sigs
for (i in 1:nrow(learned.sigs.exome)) {
learned.sigs.exome[i,] <- learned.sigs.exome[i,] / trinucleotide.freq.factor.ex[types]
}
plot_sig(t(learned.sigs.exome))
ICGC data for COAD and STAD dataset was downloaded from ICGC DCC using COAD-US and STAD-US projects with WXS analysis type, respectively. The vcf file describing all somatic mutations across ICGC dataset is stored here under the ‘simple_somatic_mutation.aggregated.vcf.gz’ name.
# download and read huge aggregated variant file
big.vcf <- readVcf("simple_somatic_mutation.aggregated.vcf")
mutations.COAD <- read.table(file="COAD/simple_somatic_mutation.open.tsv",sep="\t",header=T)
mutations.STAD <- read.table(file="STAD/simple_somatic_mutation.open.tsv",sep="\t",header=T)
per_sample_list <- sapply(unique(mutations.COAD$icgc_donor_id), function(donor) {
unique(mutations.COAD$icgc_mutation_id[mutations.COAD$icgc_donor_id==donor])
})
names(per_sample_list) <- unique(mutations.COAD$icgc_donor_id)
vcf_list_COAD <- sapply(per_sample_list, function(x) big.vcf[as.character(x)])
names(vcf_list_COAD) <- names(per_sample_list)
vcf_list_COAD <- sapply(vcf_list_COAD, rowRanges)
for (sample in names(vcf_list_COAD)) {
seqlevels(vcf_list_COAD[[sample]]) <- seqnames(get(ref_genome))[c(as.numeric(seqlevels(vcf_list_COAD[[sample]])[1:22]),25,23,24)]
vcf_list_COAD[[sample]] <- vcf_list_COAD[[sample]][seqnames(vcf_list_COAD[[sample]])!="chrM"]
vcf_list_COAD[[sample]] <- vcf_list_COAD[[sample]][seqnames(vcf_list_COAD[[sample]])!="chrY"]
}
per_sample_list <- sapply(unique(mutations.STAD$icgc_donor_id), function(donor) {
unique(mutations.STAD$icgc_mutation_id[mutations.STAD$icgc_donor_id==donor])
})
names(per_sample_list) <- unique(mutations.STAD$icgc_donor_id)
vcf_list_STAD <- sapply(per_sample_list, function(x) big.vcf[as.character(x)])
names(vcf_list_STAD) <- names(per_sample_list)
vcf_list_STAD <- sapply(vcf_list_STAD, rowRanges)
for (sample in names(vcf_list_STAD)) {
seqlevels(vcf_list_STAD[[sample]]) <- seqnames(get(ref_genome))[c(as.numeric(seqlevels(vcf_list_STAD[[sample]])[1:22]),25,23,24)]
vcf_list_STAD[[sample]] <- vcf_list_STAD[[sample]][seqnames(vcf_list_STAD[[sample]])!="chrM"]
vcf_list_STAD[[sample]] <- vcf_list_STAD[[sample]][seqnames(vcf_list_STAD[[sample]])!="chrY"]
}
all.types <- c(types.full,"INS_A","INS_C","INS_G","INS_T","DEL_A","DEL_C","DEL_G","DEL_T")
sub_list <- sapply(vcf_list_COAD, function(vcf) {
vcf[width(vcf$REF)==1 & width(unlist(vcf$ALT))==1,]
})
del_list <- sapply(vcf_list_COAD, function(vcf) {
vcf[width(vcf$REF)==2 & width(unlist(vcf$ALT))==1,]
})
ins_list <- sapply(vcf_list_COAD, function(vcf) {
vcf[width(vcf$REF)==1 & width(unlist(vcf$ALT))==2,]
})
COAD.mutation.counts = matrix(0,nrow=length(vcf_list_COAD),ncol=length(all.types),dimnames=list(names(vcf_list_COAD),all.types))
for (i in 1:nrow(COAD.mutation.counts)) {
type_context = type_context(sub_list[[i]], ref_genome)
counts <- table(type_context)
for (a in rownames(counts)) {
tmp = unlist(strsplit(a,split="[>]"))
inds <- colnames(counts)[counts[a,]>0]
columns = as.vector(sapply(inds, function(x) paste(substr(x,1,1),"[",a,"]",substr(x,nchar(x),nchar(x)),sep="")))
COAD.mutation.counts[i,columns] = counts[a,inds]
}
COAD.mutation.counts[i,97:100] <- table(substr(unlist(ins_list[[i]]$ALT),2,2))[c("A","C","G","T")]
COAD.mutation.counts[i,101:104] <- table(substr(del_list[[i]]$REF,2,2))[c("A","C","G","T")]
print(i)
}
COAD.mutation.counts[is.na(COAD.mutation.counts)] <- 0
sub_list <- sapply(vcf_list_STAD, function(vcf) {
vcf[width(vcf$REF)==1 & width(unlist(vcf$ALT))==1,]
})
del_list <- sapply(vcf_list_STAD, function(vcf) {
vcf[width(vcf$REF)==2 & width(unlist(vcf$ALT))==1,]
})
ins_list <- sapply(vcf_list_STAD, function(vcf) {
vcf[width(vcf$REF)==1 & width(unlist(vcf$ALT))==2,]
})
STAD.mutation.counts = matrix(0,nrow=length(vcf_list_STAD),ncol=length(all.types),dimnames=list(names(vcf_list_STAD),all.types))
for (i in 1:nrow(STAD.mutation.counts)) {
type_context = type_context(sub_list[[i]], ref_genome)
counts <- table(type_context)
for (a in rownames(counts)) {
tmp = unlist(strsplit(a,split="[>]"))
inds <- colnames(counts)[counts[a,]>0]
columns = as.vector(sapply(inds, function(x) paste(substr(x,1,1),"[",a,"]",substr(x,nchar(x),nchar(x)),sep="")))
STAD.mutation.counts[i,columns] = counts[a,inds]
}
STAD.mutation.counts[i,97:100] <- table(substr(unlist(ins_list[[i]]$ALT),2,2))[c("A","C","G","T")]
STAD.mutation.counts[i,101:104] <- table(substr(del_list[[i]]$REF,2,2))[c("A","C","G","T")]
print(i)
}
STAD.mutation.counts[is.na(STAD.mutation.counts)] <- 0
Or just upload the prepared data:
load("ICGC data/profiles_and_decomposition.RData")
Signature extraction from the whole set is performed via Brunet version non-negative matrix factorization using NMF package. The number of signatures is chosen as the number of signatures where both residual sum of squares and Akakike Information Criterion values stabilize.
Plot AIC:
df = data.frame(rank=2:12,AIC)
ggplot(df,aes(x=rank,y=AIC)) + geom_point() +
ggtitle("AIC for selecting the number of signatures") +
geom_hline(yintercept = AIC[7],linetype = "longdash",colour="red")
Now plot RSS per rank:
df = data.frame(rank=2:12,rss)
ggplot(df,aes(x=rank,y=rss)) + geom_point() +
ggtitle("RSS for selecting the number of signatures") +
geom_hline(yintercept = rss[7],linetype = "longdash",colour="red")
This is how the final set of signatures looks like:
res <- NMF::nmf(x=mut_mat,rank=8,seed=123456,method='brunet')
sigs <- NMF::basis(res)
sigs <- sigs[,c(8,1,2,5,4,7,3,6)] # Reorder the signatures
decomposition <- t(NMF::coef(res))
decomposition <- decomposition[,c(8,1,2,5,4,7,3,6)]
colnames(sigs) <- c("Clock-1", "Clock-2", "POLE", "17-like", "MMR-1", "MMR-2", "MMR-3", "SNP")
colnames(decomposition) <- colnames(sigs)
for (i in 1:nrow(decomposition))
decomposition[i,] = decomposition[i,]/sum(decomposition[i,])
plot_sig_104(sigs,size=12)
Signatures are assigned as follows: Clock-1 (5meC), Clock-2 (+APOBEC), POLE, 17-like, MMR-1 (20), MMR-2 (15), MMR-3 (21), MMR-4 (unknown).
Get microsatellite stability/instability (MSS/MSI status) from UCSC (or rather the Cancer Genome Atlas Clinical Explorer):
icgc.coad <- rownames(COAD.mutation.counts)
icgc.stad <- rownames(STAD.mutation.counts)
coad <- read.delim('ICGC data/COAD/donor.tsv',header=T)
stad <- read.delim('ICGC data/STAD/donor.tsv',header=T)
tcga.coad <- as.character(coad$submitted_donor_id[match(icgc.coad, coad$icgc_donor_id)])
tcga.stad <- as.character(stad$submitted_donor_id[match(icgc.stad, stad$icgc_donor_id)])
mss_coadread <- read.delim('ICGC data/COAD/COADREAD_2015-04-02_ClinicalParameters.txt',header=T)
mss_stad <- read.delim('ICGC data/STAD/STAD_2015-04-02_ClinicalParameters.txt',header=T)
mmr.coad <- icgc.coad[match(intersect(mss_coadread$SampleCode[mss_coadread$MSIstatus=='MSI-H'], tcga.coad),tcga.coad)]
mmr.stad <- icgc.stad[match(intersect(mss_stad$SampleCode[mss_stad$MSIstatus=='MSI-H'], tcga.stad),tcga.stad)]
mmr.ucsc <- c(mmr.stad, mmr.coad)
To assess the distribution of signatures in the data, we generate tSNE plot based on cosine similarity between sample profiles. Circle sizes reflect number of mutations, black rim - microsatellite instability (MSI high), coloured sectors correspond to relative contributions of different signatures. Note that all MSI samples group in one cluster across both datasets.
Visualize the similarity map:
Let’s plot signature decomposition for MSI samples in both datasets:
Association with MSI status: calculate P-values for relative signature contributions compared with MSI status.
pvals <- NULL
for (i in 1:8)
{
pvals <- c(pvals, wilcox.test(decomposition[mmr.ucsc,i], decomposition[setdiff(rownames(mm),mmr.ucsc),i], paired=F,alternative='greater')$p.value)
}
names(pvals) = colnames(decomposition)
print(p.adjust(pvals,method = 'bonf')) # Take 0.01 as significance threshold (>10 tests)
## Clock-1 Clock-2 POLE 17-like MMR-1
## 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.720233e-51
## MMR-2 MMR-3 SNP
## 2.695689e-22 9.806070e-29 1.000000e+00
MMR-1 seems to be a good indicator of MSI status. Calculate AUC for MMR-1:
library(pROC)
mmr.type <- sapply(rownames(mm), function(x) {
if (x %in% mmr.ucsc) return("MSI")
else return("MSS")
})
roc(response = factor(mmr.type), predictor = decomposition[,5])
##
## Call:
## roc.default(response = factor(mmr.type), predictor = decomposition[, 5])
##
## Data: decomposition[, 5] in 103 controls (factor(mmr.type) MSI) > 402 cases (factor(mmr.type) MSS).
## Area under the curve: 0.9847
ci(response = factor(mmr.type), predictor = decomposition[,5])
## 95% CI: 0.969-1 (DeLong)
Boxplots for signature MMR-1, MMR-2, MMR-3 contribution vs MSI status:
## Using sample, mmr.type as id variables
As we can see from the signature plot, all indels are coming with MMR-1; to confirm it, plot the absolute contribution of MMR signatures vs the number of indels:
To see the unusual contribution of Clock-1 signature, plot the average fold change (log scale) in the number of mutations assigned to different signatures in MSI samples and MSS samples:
Relative contributions for MMR deficient samples:
Absolute contributions for all the samples in both datasets:
Plot the amount of indels per sample in both cohorts:
Lets find all the homopolymers of length 4 to 55 in human exome.
homopolymer_pool_length_4to55 <- lapply(seq(4,55),function(y) lapply(c("A","C","G","T"),function(x)
if (x=="A") {paste0("B",paste(rep(x,y),collapse=""),"B")
}
else
if (x=="T") {paste0("V",paste(rep(x,y),collapse=""),"V")
}
else
if (x=="G") {paste0("H",paste(rep(x,y),collapse=""),"H")
}
else
if (x=="C") {paste0("D",paste(rep(x,y),collapse=""),"D")
}
))
homopolymers <- unlist(homopolymer_pool_length_4to55)
pattern_all <- DNAStringSet(homopolymers)
hits_all <- list()
for (i in 1:23) {
maskMotif(a[[i]], "N") -> masked
hits_all[[i]] <- sapply(pattern_all,
matchPattern,
subject=masked,
fixed=F)
}
lengths <- lapply(hits_all, function(l) sapply(l,length))
hits_all <- lapply(1:length(hits_all), function(i) hits_all[[i]][which(lengths[[i]]>0)])
nonzero_lengths <- lapply(hits_all, function(chr) sapply(chr, length)) # numbers of homopolymers on each chromosome for non empty classes only
names(hits_all) <- seqlevels(gr)
sites.gr <- lapply(1:23, function(i) do.call("c",lapply(lapply(hits_all[[i]], as, "IRanges"),
GRanges,seqnames=names(hits_all)[i])))
for (j in 1:23) {
sites.gr[[j]]$pattern.searched <- rep(as.character(pattern_all), lengths[[j]])
sites.gr[[j]]$motif.found <- unlist(lapply(hits_all[[j]], as.character))
sites.gr[[j]]$pattern.length <- unlist(lapply(hits_all[[j]], width)) # includes the two flanking bases
sites.gr[[j]]$homopolymer.length <- (sites.gr[[j]]$pattern.length)-2 # remove 2 bases, 5' and 3' are not part of homopolymer
}
sites.gr_all <- do.call("c",sites.gr)
genome(sites.gr_all) <- "hg19" # add genome info
exomranges <- GRanges(seqnames=exome$V1,ranges=IRanges(start = exome$V2,end=exome$V3))
hits <- findOverlaps(exomranges,sites.gr_all,minoverlap = 2)
sites.gr_all <- sites.gr_all[sort(unique(subjectHits(hits)))]
all_sites <- as.data.frame(sites.gr_all) # make a data frame
# Upload the vcf files for both cohorts
load("ICGC data/STAD/vcf_list_STAD.RData")
indel_list_STAD <- sapply(vcf_list_STAD, function(vcf) vcf[abs(width(vcf$REF)-width(unlist(vcf$ALT)))==1 & (width(vcf$REF)==1 | width(unlist(vcf$ALT))==1),])
dims <- sapply(indel_list_STAD,length,simplify = T)
load("ICGC data/COAD/vcf_list_COAD.RData")
indel_list_COAD <- sapply(vcf_list_COAD, function(vcf) vcf[abs(width(vcf$REF)-width(unlist(vcf$ALT)))==1 & (width(vcf$REF)==1 | width(unlist(vcf$ALT))==1),])
dims <- sapply(indel_list_COAD,length,simplify = T)
for (k in 1:length(indel_list_STAD)) {
genome(indel_list_STAD[[k]]) <- "hg19"
indel_list_STAD[[k]]$insertion <- width(unlist(indel_list_STAD[[k]]$ALT))-1
indel_list_STAD[[k]]$deletion <- width(indel_list_STAD[[k]]$REF)-1
}
for (k in 1:length(indel_list_COAD)) {
genome(indel_list_COAD[[k]]) <- "hg19"
indel_list_COAD[[k]]$insertion <- width(unlist(indel_list_COAD[[k]]$ALT))-1
indel_list_COAD[[k]]$deletion <- width(indel_list_COAD[[k]]$REF)-1
}
indels.in.hp.stad <- vector("numeric",length(indel_list_STAD))
for (i in 1:length(indel_list_STAD)) {
x <- subsetByOverlaps(indel_list_STAD[[i]],sites.gr_all)
indels.in.hp.stad[i] <- nrow(as.data.frame(x))
}
names(indels.in.hp.stad) <- names(indel_list_STAD)
indels.in.hp.coad <- vector("numeric",length(indel_list_COAD))
for (i in 1:length(indel_list_COAD)) {
x <- subsetByOverlaps(indel_list_COAD[[i]],sites.gr_all)
indels.in.hp.coad[i] <- nrow(as.data.frame(x))
}
names(indels.in.hp.coad) <- names(indel_list_COAD)
Plot indels in homopolymers:
Same boxplots for realtive contributions in MSI and MSS samples for all signatures simultaneously:
Absolute contributions:
We calculate cosine similarities between COSMIC signatures and de novo signature set; and also between C.elegans signatures and all if the abovementioned. Cosine similarity score measures the cosine of an angle between two vectors:
\(Sim(a,b) = \frac{<a,b>}{||a||*||b||}\)
cosine <- function(x,y) {
x %*% y / sqrt(sum(x**2)) / sqrt(sum(y**2))
}
The higher the similarity, the closer the vectors are to each other.
Similarities between de novo signatures and C. elegans derived mlh-1, pms-2 and pole-4;pms-2 mutational patterns:
## mlh.1 pms.2 pole.4.pms.2
## Clock-1 0.15 0.18 0.17
## Clock-2 0.41 0.36 0.32
## POLE 0.17 0.19 0.21
## 17-like 0.08 0.23 0.21
## MMR-1 0.41 0.81 0.85
## MMR-2 0.17 0.36 0.43
## MMR-3 0.29 0.62 0.56
## SNP 0.27 0.52 0.47
Similarities between de novo signatures and COSMIC cancer signatures:
## Signature.1 Signature.2 Signature.3 Signature.4 Signature.5
## Clock-1 0.98 0.11 0.14 0.12 0.53
## Clock-2 0.25 0.62 0.75 0.55 0.66
## POLE 0.32 0.26 0.19 0.17 0.31
## 17-like 0.06 0.04 0.22 0.06 0.26
## MMR-1 0.59 0.08 0.43 0.43 0.71
## MMR-2 0.67 0.04 0.14 0.11 0.49
## MMR-3 0.39 0.09 0.34 0.19 0.62
## SNP 0.73 0.14 0.48 0.35 0.83
## Signature.6 Signature.7 Signature.8 Signature.9 Signature.10
## Clock-1 0.86 0.23 0.42 0.39 0.36
## Clock-2 0.18 0.57 0.63 0.47 0.13
## POLE 0.24 0.41 0.32 0.39 0.97
## 17-like 0.08 0.05 0.16 0.61 0.06
## MMR-1 0.79 0.17 0.55 0.52 0.24
## MMR-2 0.86 0.09 0.34 0.34 0.21
## MMR-3 0.42 0.17 0.35 0.48 0.16
## SNP 0.69 0.32 0.52 0.60 0.25
## Signature.11 Signature.12 Signature.13 Signature.14 Signature.15
## Clock-1 0.16 0.12 0.03 0.63 0.52
## Clock-2 0.54 0.39 0.50 0.26 0.12
## POLE 0.13 0.12 0.08 0.25 0.18
## 17-like 0.05 0.27 0.02 0.07 0.06
## MMR-1 0.35 0.46 0.04 0.85 0.70
## MMR-2 0.25 0.14 0.03 0.87 0.99
## MMR-3 0.26 0.79 0.03 0.40 0.35
## SNP 0.38 0.62 0.06 0.52 0.38
## Signature.16 Signature.17 Signature.18 Signature.19 Signature.20
## Clock-1 0.12 0.20 0.25 0.48 0.53
## Clock-2 0.68 0.19 0.45 0.52 0.31
## POLE 0.24 0.18 0.37 0.19 0.24
## 17-like 0.36 0.98 0.04 0.08 0.12
## MMR-1 0.47 0.20 0.35 0.62 0.85
## MMR-2 0.14 0.12 0.20 0.52 0.30
## MMR-3 0.54 0.18 0.16 0.41 0.53
## SNP 0.61 0.29 0.31 0.65 0.79
## Signature.21 Signature.22 Signature.23 Signature.24 Signature.25
## Clock-1 0.42 0.07 0.15 0.12 0.47
## Clock-2 0.19 0.22 0.39 0.44 0.55
## POLE 0.17 0.03 0.09 0.15 0.33
## 17-like 0.12 0.07 0.03 0.03 0.20
## MMR-1 0.43 0.08 0.43 0.32 0.50
## MMR-2 0.22 0.05 0.32 0.21 0.33
## MMR-3 0.95 0.08 0.28 0.14 0.45
## SNP 0.63 0.13 0.39 0.25 0.60
## Signature.26 Signature.27 Signature.28 Signature.29 Signature.30
## Clock-1 0.46 0.16 0.08 0.45 0.38
## Clock-2 0.27 0.25 0.13 0.49 0.68
## POLE 0.23 0.09 0.21 0.27 0.21
## 17-like 0.20 0.04 0.73 0.05 0.11
## MMR-1 0.65 0.22 0.17 0.48 0.50
## MMR-2 0.49 0.18 0.06 0.33 0.38
## MMR-3 0.91 0.16 0.18 0.28 0.36
## SNP 0.71 0.21 0.23 0.47 0.55
Check if contribution of any signature correlates with age (answer - no).
Correlation with relative contributions:
## Clock-1 Clock-2 POLE 17-like MMR-1 MMR-2
## 0.06675453 -0.02832896 -0.01666071 -0.04382598 0.12239589 0.11216179
## MMR-3 SNP
## 0.03552525 -0.01160837
Correlation with absolute contributions:
## Clock-1 Clock-2 POLE 17-like MMR-1 MMR-2
## 0.21618104 0.21717069 0.15605675 0.07358448 0.19466476 0.20038145
## MMR-3 SNP
## 0.11834584 0.11104834
Plot averaged profiles of MSI samples in COAD and STAD datasets. Their cosine similarity shows that they are nearly identical.
## [,1]
## [1,] 0.9946953